// --------------------------------------------------------------------------
// Prepare data for computing the cluster points that determine the centers
// This version: May 17, 2024
// --------------------------------------------------------------------------


clear
clear all

//local data_path "./raw"
local compute_path "/Users/kristianbehrens/Desktop/programming/clustering"


local cmauid_list1 "1 5 15 11 10 105 110 210 205 215 220 225 310 320 305 329 335 330 328 403 404 405 421 423 428 430 444 442 440 433 437 450 447 462 454 452 456 465 468 502 505 485 480 411 410 408 406 412 501 512 509 521 522 527 528 529 530 535 532 550 531 537 539 547 543 541 553 544 533 546 555 556 559 557 562 566 571 568 565 567 569 516 515 507 575 580 586 590 582 595 598 602 605 603 610 607 640 750 755 705 715 720 710 725 735 745 840 805 810 806 826 825 820 821 830 831 832 833 835 865 845 828 860 850 905 907 913 932 930 933 935 937 939 938 940 944 943 945 934 925 915 918 920 952 950 955 965 970 975 977 990 995"

local cmauid_list2 "5 15 11 10 105 110 210 205 215 220 225 310 320 305 329 335 330 328 403 404 405 421 423 428 430 444 442 440 433 437 450 447 462 454 452 456 465 468 502 505 485 480 411 410 408 406 412 501 512 509 521 522 527 528 529 530 535 532 550 531 537 539 547 543 541 553 544 533 546 555 556 559 557 562 566 571 568 565 567 569 516 515 507 575 580 586 590 582 595 598 602 605 603 610 607 640 750 755 705 715 720 710 725 735 745 840 805 810 806 826 825 820 821 830 831 832 833 835 865 845 828 860 850 905 907 913 932 930 933 935 937 939 938 940 944 943 945 934 925 915 918 920 952 950 955 965 970 975 977 990 995"

cd "`compute_path'"


foreach l in `cmauid_list1' {

	// Drop all rural dissemination areas and areas where we have no density info
	
	use "`compute_path'/raw/census2016.dta", clear
	keep if !missing(cmauid)
	drop if popdense2016_census == . | pop2016 == . | landarea2016_census == . | landarea2016_census == 0
	
	keep if cmauid == `l'
	order dauid lat lon
	
	insobs 1, before(1)
	destring dauid, replace

	drop popdense2016_census
	gen popdense2016_census = pop2016/landarea2016_census 
	
	qui su popdense2016_census, d

	gen dense = popdense2016_census > r(p75) if !missing(popdense2016_census)
	
	order cmauid lat lon dense pop2016 dauid
	
	count
	replace cmauid = r(N) in 1
	
	count if dense == 1
	replace lat = cmauid[1] - r(N) in 1
	
	//order cmauid cmauid lat lon dense pop2016
	keep cmauid lat lon dense pop2016

	export delimited using "`compute_path'/input/cma_`l'_p75.txt", delimiter(" ") novarnames replace
}


// *****************************
// [[ RUN C code here ]]
// Import the result: output.txt
// *****************************


foreach l in `cmauid_list1' {
	
	import delimited "`compute_path'/output/cma_`l'_p75_out.txt", delimiter(",") encoding(UTF-8) clear  
	
	gen cmauid= `l'
	
	gen weight = other_w + own_w
	egen totweight_other = total(other_w) 
	egen totweight_own = total(own_w)
	gen totweight = totweight_other + totweight_own
	gen popshare = weight/totweight
	
	drop if pval > 0.01
	
	qui su popshare, d
	keep if popshare >= r(p95)
	
	keep id lat lon weight cmauid popshare
	
	order cmauid lat lon weight cmauid popshare id
	save "`compute_path'/output/clusters_`l'_p75.dta", replace
	
}


use "`compute_path'/output/clusters_1_p75.dta", clear

foreach l in `cmauid_list2' {
	
	append using "`compute_path'/output/clusters_`l'_p75.dta"
	
}

export delimited using "`compute_path'/output/clusters.txt", delimiter(",") replace



// ****************************************
// [[ GIS PROCESSING OF THE DATA ]]
//
// This is a manual step where we determine all single clusters
// and merge those that overlap. See the paper for details.
//
// Import the file "`compute_path'/output/clusters.txt" into GIS and process the data.
//
// The resulting shapefiles are in: ./output_shp
//
// The csv filers are in ./output/centers/single_centers.csv
// and ./output/centers/merged_centers.csv
//
// Import the result: single_centers.csv
//					  merged_centers.csv
// ****************************************

// Combine the data

import delimited using "`compute_path'/output/centers/single_centers.csv", delimiter(",") clear

keep cmauid ceny cenx weight	// Note: Use ceny and cenx for lat lon (the lat lon in the merged files are not correct, they got summed)
order cmauid ceny cenx weight
ren ceny lat
ren cenx lon

save "`compute_path'/temp/temp.dta", replace


import delimited using "`compute_path'/output/centers/merged_centers.csv", delimiter(",") clear

keep cmauid ceny cenx weight	// Note: Use ceny and cenx for lat lon (the lat lon in the merged files are not correct, they got summed)
order cmauid ceny cenx weight
ren ceny lat
ren cenx lon

append using "`compute_path'/temp/temp.dta"

sort cmauid

export delimited using "`compute_path'/output/centers/centers_cpp_input.txt", delimiter(" ") replace novarnames

